Business Case #4 - Recommender System

Authors:

Débora Santos (m20200748),Pedro Henrique Medeiros (m20200742), Rebeca Pinheiro (m20201096)

Group D - D4B Consulting

In [1]:
#IMPORT LIBRARIES
import sqlite3
import os
import pandas as pd
import numpy as np
import random
from numpy import linalg as LA
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime as dt
from sklearn.impute import KNNImputer
from sklearn.cluster import KMeans, AgglomerativeClustering, MeanShift, DBSCAN, estimate_bandwidth
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, LabelEncoder
from itertools import product
from math import ceil
from scipy.cluster.hierarchy import dendrogram
%matplotlib inline
#from pandas_profiling import ProfileReport 
%config InlineBackend.figure_format = 'retina' 
from scipy.stats import iqr as IQR
from collections import Counter
import scipy.stats as stat
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import coo_matrix, csr_matrix
from implicit.als import AlternatingLeastSquares
from implicit.bpr import BayesianPersonalizedRanking
from implicit.lmf import LogisticMatrixFactorization
from implicit.evaluation import ranking_metrics_at_k
from tqdm import tqdm
import plotly.graph_objects as go
from sklearn.metrics import roc_curve, auc
from scipy.sparse import csr_matrix
import scipy.sparse as sparse
import warnings
warnings.filterwarnings('ignore')

# Set seaborn style
sns.set()

Collect initial data

In [2]:
#import file csv
df = pd.read_csv('retail.csv')
#pd.set_option('display.max_rows', 500)
#pd.set_option('display.max_columns', 500)
#pd.set_option('display.width', 1000)

Describe, explore and assess data quality

In [3]:
#Show first 5 rows of data
df.head()
Out[3]:
InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice CustomerID Country
0 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6 12/1/2010 8:26 2.55 17850.0 United Kingdom
1 536365 71053 WHITE METAL LANTERN 6 12/1/2010 8:26 3.39 17850.0 United Kingdom
2 536365 84406B CREAM CUPID HEARTS COAT HANGER 8 12/1/2010 8:26 2.75 17850.0 United Kingdom
3 536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE 6 12/1/2010 8:26 3.39 17850.0 United Kingdom
4 536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6 12/1/2010 8:26 3.39 17850.0 United Kingdom
In [4]:
#Show last 5 rows of data
df.tail()
Out[4]:
InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice CustomerID Country
541904 581587 22613 PACK OF 20 SPACEBOY NAPKINS 12 12/9/2011 12:50 0.85 12680.0 France
541905 581587 22899 CHILDREN'S APRON DOLLY GIRL 6 12/9/2011 12:50 2.10 12680.0 France
541906 581587 23254 CHILDRENS CUTLERY DOLLY GIRL 4 12/9/2011 12:50 4.15 12680.0 France
541907 581587 23255 CHILDRENS CUTLERY CIRCUS PARADE 4 12/9/2011 12:50 4.15 12680.0 France
541908 581587 22138 BAKING SET 9 PIECE RETROSPOT 3 12/9/2011 12:50 4.95 12680.0 France
In [5]:
#Show info about the data
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB
In [6]:
#Replace empty cells by nam
df.replace("", np.nan, inplace=True)
In [7]:
# Summary statistics for all variables
df.describe(include='all').transpose()
Out[7]:
count unique top freq mean std min 25% 50% 75% max
InvoiceNo 541909 25900 573585 1114 NaN NaN NaN NaN NaN NaN NaN
StockCode 541909 4070 85123A 2313 NaN NaN NaN NaN NaN NaN NaN
Description 540455 4223 WHITE HANGING HEART T-LIGHT HOLDER 2369 NaN NaN NaN NaN NaN NaN NaN
Quantity 541909 NaN NaN NaN 9.55225 218.081 -80995 1 3 10 80995
InvoiceDate 541909 23260 10/31/2011 14:41 1114 NaN NaN NaN NaN NaN NaN NaN
UnitPrice 541909 NaN NaN NaN 4.61111 96.7599 -11062.1 1.25 2.08 4.13 38970
CustomerID 406829 NaN NaN NaN 15287.7 1713.6 12346 13953 15152 16791 18287
Country 541909 38 United Kingdom 495478 NaN NaN NaN NaN NaN NaN NaN
In [8]:
# Function to show missing values and their percentages
def missing_data(df):
    l = []
    for col in df.columns:
        missing = df[col].isnull().sum(axis=0)
        percentage = missing/df.shape[0]
        l.append([col,missing,percentage*100])
        df_missing = pd.DataFrame(l, columns = ['Feature','missing','percentage'])
        df_missing = df_missing.sort_values('missing',axis=0, ascending = False)    
    return df_missing
In [9]:
#Check Missing data
df_missing = missing_data(df)
df_missing = df_missing[df_missing['missing']>0]
df_missing
Out[9]:
Feature missing percentage
6 CustomerID 135080 24.926694
2 Description 1454 0.268311
In [10]:
# Check duplicated observations
df.loc[df.duplicated(keep=False), :]
Out[10]:
InvoiceNo StockCode Description Quantity InvoiceDate UnitPrice CustomerID Country
485 536409 22111 SCOTTIE DOG HOT WATER BOTTLE 1 12/1/2010 11:45 4.95 17908.0 United Kingdom
489 536409 22866 HAND WARMER SCOTTY DOG DESIGN 1 12/1/2010 11:45 2.10 17908.0 United Kingdom
494 536409 21866 UNION JACK FLAG LUGGAGE TAG 1 12/1/2010 11:45 1.25 17908.0 United Kingdom
517 536409 21866 UNION JACK FLAG LUGGAGE TAG 1 12/1/2010 11:45 1.25 17908.0 United Kingdom
521 536409 22900 SET 2 TEA TOWELS I LOVE LONDON 1 12/1/2010 11:45 2.95 17908.0 United Kingdom
... ... ... ... ... ... ... ... ...
541675 581538 22068 BLACK PIRATE TREASURE CHEST 1 12/9/2011 11:34 0.39 14446.0 United Kingdom
541689 581538 23318 BOX OF 6 MINI VINTAGE CRACKERS 1 12/9/2011 11:34 2.49 14446.0 United Kingdom
541692 581538 22992 REVOLVER WOODEN RULER 1 12/9/2011 11:34 1.95 14446.0 United Kingdom
541699 581538 22694 WICKER STAR 1 12/9/2011 11:34 2.10 14446.0 United Kingdom
541701 581538 23343 JUMBO BAG VINTAGE CHRISTMAS 1 12/9/2011 11:34 2.08 14446.0 United Kingdom

10147 rows × 8 columns

In [11]:
# Create a dataframe copy
df1 = df.copy()

Data Preparation

Clean data

In [12]:
# Drop duplicated rows
df1.drop_duplicates(inplace=True)
In [13]:
# Select irregular stock code values
irregular_stockcode = ['DCGS0076','DCGS0003','DCGS0070','DCGS0055', 'DCGS0072', 'DCGS0074', 'DCGS0069', 'DCGS0057',
                       'DCGSSBOY', 'DCGSSGIRL', 'DCGS0004', 'DCGS0073', 'DCGS0071', 'DCGS0068', 'DCGS0067', 'DCGS0066P',
                       'POST', 'D', 'DOT','M', 'BANK CHARGES', 'S', 'AMAZONFEE', 'gift_0001_40', 'gift_0001_50', 
                       'gift_0001_30', 'gift_0001_20','gift_0001_10', 'PADS', 'B', 'CRUK', 'c2','C2','m']
In [14]:
# Remove records with irregular stock code values
df1 = df1[~  df1['StockCode'].isin(irregular_stockcode)]
In [15]:
#Remove transactions with price =0.  
df1 = df1[df1['UnitPrice']>0]
In [16]:
print('Percentage of data kept after removing irregularities:', np.round(df1.shape[0] / df.shape[0], 4))
Percentage of data kept after removing irregularities: 0.9802
In [17]:
# 'Unspecified' will be kept because it does not affect the collaborative filtering algorithm
df1['Country'].value_counts()
Out[17]:
United Kingdom          486167
Germany                   9080
France                    8218
EIRE                      8059
Spain                     2462
Netherlands               2326
Belgium                   1971
Switzerland               1960
Portugal                  1466
Australia                 1253
Norway                    1059
Italy                      783
Channel Islands            752
Finland                    653
Cyprus                     608
Unspecified                442
Sweden                     436
Austria                    387
Denmark                    375
Japan                      355
Poland                     336
Israel                     294
USA                        291
Hong Kong                  276
Singapore                  215
Iceland                    182
Canada                     150
Greece                     142
Malta                      123
United Arab Emirates        67
European Community          58
RSA                         57
Lebanon                     45
Lithuania                   35
Brazil                      32
Czech Republic              28
Bahrain                     19
Saudi Arabia                10
Name: Country, dtype: int64
In [18]:
#Remove duplicate descriptions to the same stockCode. The remaining ones are very similar so doesn't matter each one will be dropped; 
#Create a new dataframe with the columns StockCode and Description
df_stock_desc = df1[['StockCode','Description']]
#Drop duplicates based on StockCode column
df_stock_desc = df_stock_desc.drop_duplicates(subset='StockCode', keep="last")     
#Create a copy of df1
df2 = df1.copy()
#Merge df2 with the df_stock_desc to have only one description by StockCode
df3 = pd.merge(df2,df_stock_desc,on = 'StockCode', how = 'left')
#Check the results
df3
Out[18]:
InvoiceNo StockCode Description_x Quantity InvoiceDate UnitPrice CustomerID Country Description_y
0 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6 12/1/2010 8:26 2.55 17850.0 United Kingdom CREAM HANGING HEART T-LIGHT HOLDER
1 536365 71053 WHITE METAL LANTERN 6 12/1/2010 8:26 3.39 17850.0 United Kingdom WHITE MOROCCAN METAL LANTERN
2 536365 84406B CREAM CUPID HEARTS COAT HANGER 8 12/1/2010 8:26 2.75 17850.0 United Kingdom CREAM CUPID HEARTS COAT HANGER
3 536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE 6 12/1/2010 8:26 3.39 17850.0 United Kingdom KNITTED UNION FLAG HOT WATER BOTTLE
4 536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6 12/1/2010 8:26 3.39 17850.0 United Kingdom RED WOOLLY HOTTIE WHITE HEART.
... ... ... ... ... ... ... ... ... ...
531167 581587 22613 PACK OF 20 SPACEBOY NAPKINS 12 12/9/2011 12:50 0.85 12680.0 France PACK OF 20 SPACEBOY NAPKINS
531168 581587 22899 CHILDREN'S APRON DOLLY GIRL 6 12/9/2011 12:50 2.10 12680.0 France CHILDREN'S APRON DOLLY GIRL
531169 581587 23254 CHILDRENS CUTLERY DOLLY GIRL 4 12/9/2011 12:50 4.15 12680.0 France CHILDRENS CUTLERY DOLLY GIRL
531170 581587 23255 CHILDRENS CUTLERY CIRCUS PARADE 4 12/9/2011 12:50 4.15 12680.0 France CHILDRENS CUTLERY CIRCUS PARADE
531171 581587 22138 BAKING SET 9 PIECE RETROSPOT 3 12/9/2011 12:50 4.95 12680.0 France BAKING SET 9 PIECE RETROSPOT

531172 rows × 9 columns

In [19]:
#Check if it worked - After remove duplicates based on StockCode + Description, we had only 3914 rows. Exactly the unique StockCodes that we have in original dataset. 
df4 = df3[['StockCode','Description_y']]
df4.drop_duplicates(inplace = True)     
df4
Out[19]:
StockCode Description_y
0 85123A CREAM HANGING HEART T-LIGHT HOLDER
1 71053 WHITE MOROCCAN METAL LANTERN
2 84406B CREAM CUPID HEARTS COAT HANGER
3 84029G KNITTED UNION FLAG HOT WATER BOTTLE
4 84029E RED WOOLLY HOTTIE WHITE HEART.
... ... ...
499195 85179a GREEN BITTY LIGHT CHAIN
502339 23617 SET 10 CARDS SWIRLY XMAS TREE 17104
516538 90214U LETTER "U" BLING KEY RING
526540 47591b SCOTTIES CHILDRENS APRON
529699 23843 PAPER CRAFT , LITTLE BIRDIE

3914 rows × 2 columns

In [20]:
#Drop the columns with the descriptions duplicated
df3.drop(['Description_x'], axis=1, inplace=True)
#Change the name of the column with the unique description.
df3.rename(columns = {'Description_y':'Description'}, inplace = True)
#Copy the results to df1. 
df1 = df3.copy()
df1
Out[20]:
InvoiceNo StockCode Quantity InvoiceDate UnitPrice CustomerID Country Description
0 536365 85123A 6 12/1/2010 8:26 2.55 17850.0 United Kingdom CREAM HANGING HEART T-LIGHT HOLDER
1 536365 71053 6 12/1/2010 8:26 3.39 17850.0 United Kingdom WHITE MOROCCAN METAL LANTERN
2 536365 84406B 8 12/1/2010 8:26 2.75 17850.0 United Kingdom CREAM CUPID HEARTS COAT HANGER
3 536365 84029G 6 12/1/2010 8:26 3.39 17850.0 United Kingdom KNITTED UNION FLAG HOT WATER BOTTLE
4 536365 84029E 6 12/1/2010 8:26 3.39 17850.0 United Kingdom RED WOOLLY HOTTIE WHITE HEART.
... ... ... ... ... ... ... ... ...
531167 581587 22613 12 12/9/2011 12:50 0.85 12680.0 France PACK OF 20 SPACEBOY NAPKINS
531168 581587 22899 6 12/9/2011 12:50 2.10 12680.0 France CHILDREN'S APRON DOLLY GIRL
531169 581587 23254 4 12/9/2011 12:50 4.15 12680.0 France CHILDRENS CUTLERY DOLLY GIRL
531170 581587 23255 4 12/9/2011 12:50 4.15 12680.0 France CHILDRENS CUTLERY CIRCUS PARADE
531171 581587 22138 3 12/9/2011 12:50 4.95 12680.0 France BAKING SET 9 PIECE RETROSPOT

531172 rows × 8 columns

In [21]:
# Check missing data
df_missing = missing_data(df1)
df_missing = df_missing[df_missing['missing']>0]
df_missing
Out[21]:
Feature missing percentage
5 CustomerID 131516 24.759588
In [22]:
# Change 'InvoiceDate' data type to datetime, and create new column with the date respective year and month
df1['InvoiceDate'] = df1['InvoiceDate'].astype('datetime64[ns]')
df1['Month_Year'] = df1['InvoiceDate'].dt.to_period('M')
df1
Out[22]:
InvoiceNo StockCode Quantity InvoiceDate UnitPrice CustomerID Country Description Month_Year
0 536365 85123A 6 2010-12-01 08:26:00 2.55 17850.0 United Kingdom CREAM HANGING HEART T-LIGHT HOLDER 2010-12
1 536365 71053 6 2010-12-01 08:26:00 3.39 17850.0 United Kingdom WHITE MOROCCAN METAL LANTERN 2010-12
2 536365 84406B 8 2010-12-01 08:26:00 2.75 17850.0 United Kingdom CREAM CUPID HEARTS COAT HANGER 2010-12
3 536365 84029G 6 2010-12-01 08:26:00 3.39 17850.0 United Kingdom KNITTED UNION FLAG HOT WATER BOTTLE 2010-12
4 536365 84029E 6 2010-12-01 08:26:00 3.39 17850.0 United Kingdom RED WOOLLY HOTTIE WHITE HEART. 2010-12
... ... ... ... ... ... ... ... ... ...
531167 581587 22613 12 2011-12-09 12:50:00 0.85 12680.0 France PACK OF 20 SPACEBOY NAPKINS 2011-12
531168 581587 22899 6 2011-12-09 12:50:00 2.10 12680.0 France CHILDREN'S APRON DOLLY GIRL 2011-12
531169 581587 23254 4 2011-12-09 12:50:00 4.15 12680.0 France CHILDRENS CUTLERY DOLLY GIRL 2011-12
531170 581587 23255 4 2011-12-09 12:50:00 4.15 12680.0 France CHILDRENS CUTLERY CIRCUS PARADE 2011-12
531171 581587 22138 3 2011-12-09 12:50:00 4.95 12680.0 France BAKING SET 9 PIECE RETROSPOT 2011-12

531172 rows × 9 columns

In [23]:
# Create new dataframe
df_rs = df1.dropna().copy()
In [24]:
# Get max CustomerID
idMax = df1['CustomerID'].max()

# Get distinct invoice numbers for records where CustomerID is null
invoicesNullCustomer = df1[df1['CustomerID'].isnull()]['InvoiceNo'].unique()

# fill up the records where the customer id is null, giving a new customer id for the records containing the same invoice numbers
next_customerId = idMax+1
for invoiceNo in invoicesNullCustomer:
    df1.loc[df1['InvoiceNo'] == invoiceNo, 'CustomerID'] = next_customerId
    next_customerId +=1
In [25]:
# Check missing data
df_missing = missing_data(df1)
df_missing = df_missing[df_missing['missing']>0]
df_missing
Out[25]:
Feature missing percentage
In [26]:
# Show data info
df1.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 531172 entries, 0 to 531171
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    531172 non-null  object        
 1   StockCode    531172 non-null  object        
 2   Quantity     531172 non-null  int64         
 3   InvoiceDate  531172 non-null  datetime64[ns]
 4   UnitPrice    531172 non-null  float64       
 5   CustomerID   531172 non-null  float64       
 6   Country      531172 non-null  object        
 7   Description  531172 non-null  object        
 8   Month_Year   531172 non-null  period[M]     
dtypes: datetime64[ns](1), float64(2), int64(1), object(4), period[M](1)
memory usage: 40.5+ MB
In [27]:
# dataframe basic value stats
df1.describe(include='all').transpose()
Out[27]:
count unique top freq first last mean std min 25% 50% 75% max
InvoiceNo 531172 23195 573585 1112 NaT NaT NaN NaN NaN NaN NaN NaN NaN
StockCode 531172 3914 85123A 2295 NaT NaT NaN NaN NaN NaN NaN NaN NaN
Quantity 531172 NaN NaN NaN NaT NaT 9.96149 217.015 -80995 1 3 10 80995
InvoiceDate 531172 21308 2011-10-31 14:41:00 1112 2010-12-01 08:26:00 2011-12-09 12:50:00 NaN NaN NaN NaN NaN NaN NaN
UnitPrice 531172 NaN NaN NaN NaT NaT 3.29253 4.51292 0.03 1.25 2.08 4.13 649.5
CustomerID 531172 NaN NaN NaN NaT NaT 16222 2213.44 12346 14367 16241 18260 19697
Country 531172 38 United Kingdom 486167 NaT NaT NaN NaN NaN NaN NaN NaN NaN
Description 531172 3788 CREAM HANGING HEART T-LIGHT HOLDER 2295 NaT NaT NaN NaN NaN NaN NaN NaN NaN
Month_Year 531172 13 2011-11 82739 NaT NaT NaN NaN NaN NaN NaN NaN NaN

Data Exploration

In [28]:
#FINDING THE SALES PROPORTION OF EACH PRODUCT
#CREATE A GROUPED DATAFRAME BY STOCK CODE AND DESCRIPTION WITH QUANTITY SUMMED
selling_prods = pd.DataFrame(df1.groupby(['Description','StockCode']).sum()['Quantity'])
#FILTER ONLY PRODUCTS WITH QUANTITY SOLD HIGHER THAN 0
selling_prods = selling_prods[selling_prods['Quantity'] > 0]
#ORDER THE DATASET BY HIGHER QUANTITY FIRST
selling_prods.sort_values(by=['Quantity'], inplace= True, ascending=False)
#Create a % of itens sold of each product over the total
selling_prods['percentage'] = round(selling_prods.Quantity /selling_prods.Quantity.sum() *100,3)
#Create a cummulative % of itens sold
selling_prods['cumpercentage'] = selling_prods['percentage'].cumsum()
selling_prods.head(50)
Out[28]:
Quantity percentage cumpercentage
Description StockCode
POPCORN HOLDER 22197 56427 1.066 1.066
WORLD WAR 2 GLIDERS ASSTD DESIGNS 84077 53751 1.016 2.082
JUMBO BAG RED RETROSPOT 85099B 47256 0.893 2.975
ASSORTED COLOUR BIRD ORNAMENT 84879 36282 0.686 3.661
PACK OF 72 RETROSPOT CAKE CASES 21212 36016 0.681 4.342
CREAM HANGING HEART T-LIGHT HOLDER 85123A 35063 0.663 5.005
RABBIT NIGHT LIGHT 23084 30631 0.579 5.584
MINI PAINT SET VINTAGE 22492 26437 0.500 6.084
PACK OF 12 LONDON TISSUES 22616 26095 0.493 6.577
PACK OF 60 PINK PAISLEY CAKE CASES 21977 24719 0.467 7.044
VICTORIAN GLASS HANGING T-LIGHT 22178 23825 0.450 7.494
ASSORTED COLOURS SILK FAN 15036 23082 0.436 7.930
BROCADE RING PURSE 17003 23017 0.435 8.365
RED HARMONICA IN BOX 21915 21836 0.413 8.778
JUMBO BAG PINK POLKADOT 22386 20992 0.397 9.175
JUMBO BAG VINTAGE DOILY 23203 19895 0.376 9.551
PAPER CHAIN KIT 50'S CHRISTMAS 22086 18876 0.357 9.908
ANTIQUE SILVER T-LIGHT GLASS 84946 18874 0.357 10.265
LUNCH BAG RED RETROSPOT 20725 18858 0.356 10.621
60 TEATIME FAIRY CAKE CASES 84991 18015 0.340 10.961
PARTY BUNTING 47566 18006 0.340 11.301
CHARLOTTE BAG SUKI DESIGN 22355 17974 0.340 11.641
HEART OF WICKER SMALL 22469 17828 0.337 11.978
RED RETROSPOT CHARLOTTE BAG 20724 17538 0.331 12.309
JUMBO BAG STRAWBERRY 85099F 16987 0.321 12.630
COLOUR GLASS T-LIGHT HOLDER HANGING 84755 16332 0.309 12.939
GROW A FLYTRAP OR SUNFLOWER IN TIN 22693 16172 0.306 13.245
JAM MAKING SET PRINTED 22961 16065 0.304 13.549
60 CAKE CASES VINTAGE CHRISTMAS 22952 15744 0.298 13.847
PACK OF 72 SKULL CAKE CASES 21213 15121 0.286 14.133
VINTAGE SNAP CARDS 21790 14436 0.273 14.406
PLACE SETTING WHITE HEART 22151 14236 0.269 14.675
SPACEBOY LUNCH BOX 22629 14195 0.268 14.943
JUMBO BAG APPLES 23199 14185 0.268 15.211
RED TOADSTOOL LED NIGHT LIGHT 21731 13815 0.261 15.472
JUMBO BAG BAROQUE BLACK WHITE 85099C 13619 0.257 15.729
HANGING JAM JAR T-LIGHT HOLDERS 71459 13476 0.255 15.984
SMALL CHINESE STYLE SCISSOR 16014 13328 0.252 16.236
JUMBO STORAGE BAG SUKI 21931 13317 0.252 16.488
72 SWEETHEART FAIRY CAKE CASES 84992 13189 0.249 16.737
SET/20 RED RETROSPOT PAPER NAPKINS 21080 13185 0.249 16.986
WOODLAND CHARLOTTE BAG 20719 13178 0.249 17.235
LUNCH BAG VINTAGE DOILY 23209 13130 0.248 17.483
REGENCY CAKESTAND 3 TIER 22423 12996 0.246 17.729
DISCO BALL CHRISTMAS DECORATION 20668 12883 0.243 17.972
JUMBO BAG ALPHABET 23201 12772 0.241 18.213
SET OF 4 PANTRY JELLY MOULDS 22993 12581 0.238 18.451
GIRLS ALPHABET IRON ON PATCHES 84568 12442 0.235 18.686
HOMEMADE JAM SCENTED CANDLES 22969 12380 0.234 18.920
LUNCH BAG SUKI DESIGN 22383 12370 0.234 19.154

It's possible to see that the sales are distributed by many products. The highest proportion is about 1,07%

In [29]:
best_selling_prods = selling_prods.head(10)
best_selling_prods.reset_index(level=0, inplace=True)

top10products=best_selling_prods.plot(x='Description', y= 'Quantity',rot=80, kind='bar')
top10products.set_ylabel('Products Sold')
plt.show()
In [30]:
#Find the Proportion of sales by country
#Create a grouped Dataframe by Country
selling_country = pd.DataFrame(df1.groupby(['Country']).sum()['Quantity'])

#Order the dataset for the higher quantity 
selling_country.sort_values(by=['Quantity'], inplace= True, ascending=False)
#Create a % of quantity sold of each country over the total
selling_country['percentage'] = round(selling_country.Quantity /selling_country.Quantity.sum() *100,3)
#Create a cummulative % of itens sold
#selling_prods['cumpercentage'] = selling_prods['percentage'].cumsum()
selling_country.reset_index(inplace = True)
selling_country
Out[30]:
Country Quantity percentage
0 United Kingdom 4382896 82.833
1 Netherlands 199934 3.779
2 EIRE 142120 2.686
3 Germany 116234 2.197
4 France 109648 2.072
5 Australia 83335 1.575
6 Sweden 35594 0.673
7 Switzerland 30215 0.571
8 Spain 26600 0.503
9 Japan 25221 0.477
10 Belgium 22880 0.432
11 Norway 19188 0.363
12 Portugal 16055 0.303
13 Finland 10575 0.200
14 Channel Islands 9470 0.179
15 Denmark 8147 0.154
16 Italy 7957 0.150
17 Cyprus 6295 0.119
18 Singapore 5234 0.099
19 Austria 4790 0.091
20 Hong Kong 4707 0.089
21 Israel 4350 0.082
22 Poland 3644 0.069
23 Unspecified 3295 0.062
24 Canada 2762 0.052
25 Iceland 2458 0.046
26 Greece 1550 0.029
27 USA 1034 0.020
28 United Arab Emirates 981 0.019
29 Malta 934 0.018
30 Lithuania 652 0.012
31 Czech Republic 592 0.011
32 European Community 488 0.009
33 Lebanon 386 0.007
34 Brazil 356 0.007
35 RSA 351 0.007
36 Bahrain 260 0.005
37 Saudi Arabia 75 0.001
In [31]:
#Define the labels and the data
labels =selling_country['Country']
values = selling_country['percentage']
data = dict(type='pie',labels=labels,
                        values=values,)
#Define the layout
layout = dict(title=dict(text='Volume by Country - % share'))
#Create the figure
fig = go.Figure(data= data, layout=layout)
fig.update_traces(textposition='inside', textinfo='percent+label')
#fig.update_layout(showlegend=False, uniformtext_minsize=12, uniformtext_mode='hide')
fig.show()
In [32]:
#FINDING THE MOST SELLING PRODUCT BY COUNTRY
selling_prods_country = df1.groupby(['Country', 'StockCode','Description'])['Quantity'].agg('sum').reset_index()
max_selling_prods_country = selling_prods_country.groupby("Country").max()
max_selling_prods_country
Out[32]:
StockCode Description Quantity
Country
Australia 85183B YELLOW GIANT GARDEN THERMOMETER 2916
Austria 90013A ZINC WILLIE WINKIE CANDLE STICK 288
Bahrain 85040A VANILLA SCENT CANDLE JEWELLED BOX 96
Belgium 90164A WRAP SUKI AND FRIENDS 480
Brazil 84971S SPACEBOY LUNCH BOX 24
Canada 85232D YELLOW COAT RACK PARIS FASHION 504
Channel Islands 85150 ZINC T-LIGHT HOLDER STARS SMALL 407
Cyprus 85232D ZINC METAL HEART DECORATION 384
Czech Republic 85206A WOODEN TREE CHRISTMAS SCANDINAVIAN 72
Denmark 85227 WRAP VINTAGE LEAF DESIGN 288
EIRE 90209C ZINC WIRE SWEETHEART LETTER TRAY 1809
European Community 85132C WHITE ROCKING HORSE HAND PAINTED 24
Finland 90030C ZINC T-LIGHT HOLDER STARS SMALL 552
France 90201C ZINC T-LIGHT HOLDER STARS SMALL 4023
Germany 90204 ZINC WILLIE WINKIE CANDLE STICK 1206
Greece 85215 ZINC METAL HEART DECORATION 48
Hong Kong 85183B WORLD WAR 2 GLIDERS ASSTD DESIGNS 150
Iceland 85232D WOODLAND DESIGN COTTON TOTE BAG 240
Israel 90099 YELLOW COAT RACK PARIS FASHION 130
Italy 85178 ZINC FOLKART SLEIGH BELLS 240
Japan 85232D WORLD WAR 2 GLIDERS ASSTD DESIGNS 3401
Lebanon 85150 WOODEN SKITTLES GARDEN SET 24
Lithuania 85206A SILVER MUG BONE CHINA TREE OF LIFE 48
Malta 85230A WRAP CHRISTMAS VILLAGE 75
Netherlands 90057 ZINC WILLIE WINKIE CANDLE STICK 4801
Norway 85232D ZINC WILLIE WINKIE CANDLE STICK 576
Poland 85094 ZINC WILLIE WINKIE CANDLE STICK 168
Portugal 90162B ZINC WIRE KITCHEN ORGANISER 240
RSA 85099B WOODEN CROQUET GARDEN SET 12
Saudi Arabia 22969 PLASTERS IN TIN STRONGMAN 12
Singapore 85150 ZINC FOLKART SLEIGH BELLS 384
Spain 90192 ZINC T-LIGHT HOLDER STARS SMALL 1089
Sweden 85232B ZINC WILLIE WINKIE CANDLE STICK 2916
Switzerland 90114 ZINC WILLIE WINKIE CANDLE STICK 639
USA 85099B WRAP ENGLISH ROSE 88
United Arab Emirates 85215 WRAP PAISLEY PARK 72
United Kingdom 90214Z ZINC WIRE SWEETHEART LETTER TRAY 52905
Unspecified 85227 ZINC METAL HEART DECORATION 96
In [33]:
#Distribution the volume sold by month
by_month=df1.groupby(['Month_Year'])['Quantity'].agg('sum')
by_month.plot()
Out[33]:
<AxesSubplot:xlabel='Month_Year'>
In [34]:
#Show data
by_month
Out[34]:
Month_Year
2010-12    341546
2011-01    306953
2011-02    280021
2011-03    372026
2011-04    294048
2011-05    390584
2011-06    380935
2011-07    393318
2011-08    409028
2011-09    561713
2011-10    597338
2011-11    734645
2011-12    229108
Freq: M, Name: Quantity, dtype: int64
In [35]:
#Remove the rows of each invoice having unique invoices
number_orders = df1.drop_duplicates(subset='InvoiceNo', keep="last")   
#grouped by Customer id
number_orders = number_orders['CustomerID'].value_counts()
#Sort the values by the customer with highest number of invoices
number_orders = number_orders.sort_values(ascending=False,axis=0)
#Reset index
number_orders = number_orders.reset_index()
#Rename columns
number_orders.rename(columns={'index': 'customer_id'}, inplace = True)
number_orders.rename(columns={'CustomerID': 'number_invoices'}, inplace = True)
#Group the dataframe by quantity of customers according by number of invoice placed
number_orders= number_orders.groupby("number_invoices")['customer_id'].agg(['count'])
#Reset index
number_orders = number_orders.reset_index()
#Show
number_orders
Out[35]:
number_invoices count
0 1 2737
1 2 814
2 3 490
3 4 380
4 5 284
... ... ...
58 118 2
59 125 1
60 169 1
61 217 1
62 242 1

63 rows × 2 columns

In [36]:
#Create a Plotly Visualization to be included in dashboard - Line plot
number_orders5 = number_orders[number_orders['count']>5]
#Define the dataset to be used
df = number_orders5
#Define the data to plot
data = dict(type='scatter',
                  x=df['number_invoices'],
                  y=df['count'],
                  name='Quantity of customers by quantity of invoices'
                  )
#Define the layout
layout = dict(title=dict(text='Quantity of invoices'),
                  xaxis=dict(title='Number of invoices'),
                  yaxis=dict(title='Quantity of customers')
                  )
#Create the figure
fig_users = go.Figure(data, layout)
fig_users.show()
In [37]:
#CANCELED ITEMS
cancelitems= df1.loc[df1['Quantity'] < 0]
cancelitems
Out[37]:
InvoiceNo StockCode Quantity InvoiceDate UnitPrice CustomerID Country Description Month_Year
152 C536383 35004C -1 2010-12-01 09:49:00 4.65 15311.0 United Kingdom SET OF 3 COLOURED FLYING DUCKS 2010-12
233 C536391 22556 -12 2010-12-01 10:24:00 1.65 17548.0 United Kingdom PLASTERS IN TIN CIRCUS PARADE 2010-12
234 C536391 21984 -24 2010-12-01 10:24:00 0.29 17548.0 United Kingdom PACK OF 12 PINK PAISLEY TISSUES 2010-12
235 C536391 21983 -24 2010-12-01 10:24:00 0.29 17548.0 United Kingdom PACK OF 12 BLUE PAISLEY TISSUES 2010-12
236 C536391 21980 -24 2010-12-01 10:24:00 0.29 17548.0 United Kingdom PACK OF 12 RED RETROSPOT TISSUES 2010-12
... ... ... ... ... ... ... ... ... ...
529726 C581490 22178 -12 2011-12-09 09:57:00 1.95 14397.0 United Kingdom VICTORIAN GLASS HANGING T-LIGHT 2011-12
529727 C581490 23144 -11 2011-12-09 09:57:00 0.83 14397.0 United Kingdom ZINC T-LIGHT HOLDER STARS SMALL 2011-12
530981 C581568 21258 -5 2011-12-09 11:57:00 10.95 15311.0 United Kingdom VICTORIAN SEWING BOX LARGE 2011-12
530982 C581569 84978 -1 2011-12-09 11:58:00 1.25 17315.0 United Kingdom HANGING HEART JAR T-LIGHT HOLDER 2011-12
530983 C581569 20979 -5 2011-12-09 11:58:00 1.25 17315.0 United Kingdom 36 PENCILS TUBE RED RETROSPOT 2011-12

8668 rows × 9 columns

In [38]:
#MOST CANCELS ITEMS
cancelitemsprod=cancelitems.groupby(['StockCode','Description']).sum()['Quantity'].reset_index()
cancelitemsprod['Quantity']=cancelitemsprod['Quantity']*-1
cancelitemsprod = cancelitemsprod.sort_values(by='Quantity', ascending=False)
cancelitemsprod 
Out[38]:
StockCode Description Quantity
1507 23843 PAPER CRAFT , LITTLE BIRDIE 80995
1255 23166 MEDIUM CERAMIC TOP STORAGE JAR 74494
1663 84347 ROTATING SILVER ANGELS T-LIGHT HLDR 9376
133 21108 FAIRY CAKE FLANNEL ASSORTED COLOUR 3150
1831 85123A CREAM HANGING HEART T-LIGHT HOLDER 2578
... ... ... ...
739 22452 MEASURING TAPE BABUSHKA PINK 1
738 22450 SILK PURSE BABUSHKA BLUE 1
1731 84839 SWEETHEART KEY CABINET 1
1733 84843 WHITE SOAP RACK WITH 2 BOTTLES 1
1739 84857C PINK MONTE CARLO HANDBAG 1

1938 rows × 3 columns

In [39]:
top10cancelitems=cancelitemsprod.head(10)
top10cancelitems=top10cancelitems.plot(x='Description', y='Quantity', rot=80, kind='bar')
top10cancelitems.set_title('Top 10 products cancelled')
top10cancelitems.set_xlabel('ProductID')
top10cancelitems.set_ylabel('Quantity of cancelled products')
Out[39]:
Text(0, 0.5, 'Quantity of cancelled products')
In [40]:
#FINDING MOST COMMOM CANCELLED PRODUCT BY COUNTRY
cancelcountry=cancelitems.groupby(['Country','StockCode'])['Quantity'].agg('sum').reset_index()
min_cancelcountry=cancelcountry.groupby(['Country']).min()
min_cancelcountry['Quantity']=min_cancelcountry['Quantity']*-1
min_cancelcountry
Out[40]:
StockCode Quantity
Country
Australia 20725 120
Austria 22153 48
Bahrain 72802B 54
Belgium 20712 12
Channel Islands 20914 2
Cyprus 15056N 33
Czech Republic 22231 48
Denmark 16169E 25
EIRE 15056BL 288
European Community 22960 2
Finland 21216 27
France 20682 250
Germany 20676 288
Hong Kong 21843 1
Israel 22474 32
Italy 20980 13
Japan 20724 624
Malta 22065 6
Netherlands 21787 144
Norway 21791 12
Poland 21929 6
Portugal 21430 12
Saudi Arabia 22361 5
Spain 20914 288
Sweden 22077 240
Switzerland 21218 120
USA 16161P 36
United Kingdom 10133 80995

Collaborative Filtering Recommender System

Reduce sparsity

In [41]:
#Check info about dataframe
df_rs.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 399656 entries, 0 to 531171
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    399656 non-null  object        
 1   StockCode    399656 non-null  object        
 2   Quantity     399656 non-null  int64         
 3   InvoiceDate  399656 non-null  datetime64[ns]
 4   UnitPrice    399656 non-null  float64       
 5   CustomerID   399656 non-null  float64       
 6   Country      399656 non-null  object        
 7   Description  399656 non-null  object        
 8   Month_Year   399656 non-null  period[M]     
dtypes: datetime64[ns](1), float64(2), int64(1), object(4), period[M](1)
memory usage: 30.5+ MB
In [42]:
# Reduce sparcity by excluding users and items without significant event history
def threshold_ratings(df, uid_min, iid_min, max_iter=None):
    """Removes users and items with less than uid_min and iid_min event occurrences, respectively.
    Credits: https://www.ethanrosenthal.com/2016/10/19/implicit-mf-part-1/
    """
    n_users = df['CustomerID'].nunique()
    n_items = df['StockCode'].nunique()
    sparsity = float(df.shape[0]) / float(n_users * n_items) * 100
    print('Raw dataset info \n-----------------')
    print('Number of users: {}'.format(n_users))
    print('Number of items: {}'.format(n_items))
    print('Sparsity: {:4.3f}%'.format(sparsity))
    
    done, i = False, 0
    while not done:
        # When we exclude users with freq less than uid_min we might end up with new 
        # items with freq less than iid_min, so we will have to alternate back and forth
        starting_shape = df.shape[0]  # number of existing events

        uid_counts = df.groupby('CustomerID').size()  # user id frequencies
        df = df[~df['CustomerID'].isin(uid_counts[uid_counts < uid_min].index.tolist())]  # keep events with users with frequency >= uid_min

        iid_counts = df.groupby('StockCode').size()  # item id frequencies
        df = df[~df['StockCode'].isin(iid_counts[iid_counts < iid_min].index.tolist())]  # keep events with items with frequency >= iid_min

        ending_shape = df.shape[0]  # number of existing events after filters
        i += 1
        if starting_shape == ending_shape or i == max_iter:  # convergence happens
            done = True
    
    if not max_iter:
        assert(df.groupby('CustomerID').size().min() >= uid_min)
        assert(df.groupby('StockCode').size().min() >= iid_min)
    
    n_users = df['CustomerID'].nunique()
    n_items = df['StockCode'].nunique()
    sparsity = float(df.shape[0]) / float(n_users * n_items) * 100
    print('Limited dataset info \n-----------------')
    print('Number of iterations until convergence: {}'.format(i))
    print('Number of users: {}'.format(n_users))
    print('Number of items: {}'.format(n_items))
    print('Sparsity: {:4.3f}%'.format(sparsity))
    return df
In [43]:
# get limited dataset
df_limited = threshold_ratings(df_rs, 5, 5)
Raw dataset info 
-----------------
Number of users: 4362
Number of items: 3676
Sparsity: 2.492%
Limited dataset info 
-----------------
Number of iterations until convergence: 2
Number of users: 4120
Number of items: 3196
Sparsity: 3.023%
In [44]:
# Create dataframe grouped by users, products and summing up the quantities
# Instead of representing an explicit rating
# records with a larger number of quantity by a person can carry more weight in our matrix
grouped_df = df_limited.groupby(['CustomerID', 'StockCode'])['Quantity'].sum() 
grouped_df = pd.DataFrame(grouped_df)
grouped_df.reset_index(inplace=True)

# Show dataframe 
grouped_df
Out[44]:
CustomerID StockCode Quantity
0 12347.0 16008 24
1 12347.0 17021 36
2 12347.0 20665 6
3 12347.0 20719 40
4 12347.0 20780 12
... ... ... ...
265406 18287.0 84920 4
265407 18287.0 85039A 96
265408 18287.0 85039B 120
265409 18287.0 85040A 48
265410 18287.0 85173 48

265411 rows × 3 columns

In [45]:
# Remove rows where the quantity is 0 
grouped_df = grouped_df[grouped_df['Quantity'] > 0]

# Show dataframe
grouped_df
Out[45]:
CustomerID StockCode Quantity
0 12347.0 16008 24
1 12347.0 17021 36
2 12347.0 20665 6
3 12347.0 20719 40
4 12347.0 20780 12
... ... ... ...
265406 18287.0 84920 4
265407 18287.0 85039A 96
265408 18287.0 85039B 120
265409 18287.0 85040A 48
265410 18287.0 85173 48

263289 rows × 3 columns

In [46]:
# Change column types
grouped_df['Quantity'] = grouped_df['Quantity'].astype(float)
grouped_df['CustomerID'] = grouped_df['CustomerID'].astype("category")
grouped_df['StockCode'] = grouped_df['StockCode'].astype("category")

# Create two codes columns
grouped_df['Customer_ID'] = grouped_df['CustomerID'].cat.codes
grouped_df['Stock_Code'] = grouped_df['StockCode'].cat.codes

# Show dataframe
grouped_df
Out[46]:
CustomerID StockCode Quantity Customer_ID Stock_Code
0 12347.0 16008 24.0 0 23
1 12347.0 17021 36.0 0 68
2 12347.0 20665 6.0 0 105
3 12347.0 20719 40.0 0 134
4 12347.0 20780 12.0 0 173
... ... ... ... ... ...
265406 18287.0 84920 4.0 4103 2831
265407 18287.0 85039A 96.0 4103 2922
265408 18287.0 85039B 120.0 4103 2923
265409 18287.0 85040A 48.0 4103 2925
265410 18287.0 85173 48.0 4103 3019

263289 rows × 5 columns

Sparse matrices

In [47]:
# Create two matrices, one for fitting the model (item-user) and one for recommendations (user-item)
sparse_item_user = csr_matrix((grouped_df['Quantity'], (grouped_df['Stock_Code'], grouped_df['Customer_ID'])))
sparse_user_item = csr_matrix((grouped_df['Quantity'], (grouped_df['Customer_ID'], grouped_df['Stock_Code'])))

Training and validation set

Create functino to take in the original user-item matrix and "mask" a percentage of the original interactions where a user-item interaction has taken place for use as a test set. The test set will contain all of the original ratings, while the training set replaces the specified percentage of them with a zero in the original ratings matrix.

In [48]:
# Function to create the training and testing sets
def make_train(scores, pct_test = 0.2):

    # Make a copy of the original set to be the test set
    test_set = scores.copy() 

    # Store the test set as a binary preference matrix
    test_set[test_set != 0] = 1 

    # Make a copy of the original data that can be altered as the training set
    training_set = scores.copy()

    # Find the indices in the ratings data where an interaction exists
    nonzero_inds = training_set.nonzero()

    # Zip these pairs together of item,user index into list
    nonzero_pairs = list(zip(nonzero_inds[0], nonzero_inds[1]))

    # Set the random seed to zero for reproducibility
    random.seed(0) 

    # Round the number of samples needed to the nearest integer
    num_samples = int(np.ceil(pct_test*len(nonzero_pairs)))

    # Sample a random number of item-user pairs without replacement
    samples = random.sample(nonzero_pairs, num_samples)

    # Get the item row indices
    content_inds = [index[0] for index in samples]

    # Get the user column indices
    person_inds = [index[1] for index in samples] 

    # Assign all of the randomly chosen user-item pairs to zero
    training_set[content_inds, person_inds] = 0 

    # Get rid of zeros in sparse array storage after update to save space
    training_set.eliminate_zeros()

    # Return results
    return training_set, test_set, list(set(person_inds))
In [49]:
# Apply function and store the results
items_train, items_test, item_user_altered = make_train(sparse_item_user, pct_test = 0.2)

Fit models

Fit Alternating Least Square model

In [50]:
# Initialize the Alternating Least Squares (ALS) model with initial parameters
als_model = AlternatingLeastSquares(factors=20, regularization=0.1, iterations=50)

# Fit the model using the training sparse content-person matrix
alpha = 15
data = (items_train * alpha).astype('double')
als_model.fit(data)
WARNING:root:Intel MKL BLAS detected. Its highly recommend to set the environment variable 'export MKL_NUM_THREADS=1' to disable its internal multithreading

Fit Bayesian Personalized model

In [51]:
# Initialize BayesianPersonalizedRanking (BPR) model with initial parameters
bpr_model = BayesianPersonalizedRanking(factors=20, regularization=0.1, iterations=50)

# Fit the model using the training sparse content-person matrix
alpha = 15
data = (items_train * alpha).astype('double')
bpr_model.fit(data)

Fit Logistic Matrix Factorization model

In [52]:
# Initialize LogisticMatrixFactorization (LFM) model with initial parameters
lmf_model = LogisticMatrixFactorization(factors=20, regularization=0.1, iterations=50)

# Fit the model using the training sparse content-person matrix
alpha = 15
data = (items_train * alpha).astype('double')
lmf_model.fit(data)
100%|██████████| 50/50 [00:02<00:00, 20.75it/s]
In [53]:
# Get the trained user and item vectors. Convert them to csr matrices

#AlS model
user_vecs_ALS = sparse.csr_matrix(als_model.user_factors)
item_vecs_ALS = sparse.csr_matrix(als_model.item_factors)

#BPR model
user_vecs_BPR = sparse.csr_matrix(bpr_model.user_factors)
item_vecs_BPR = sparse.csr_matrix(bpr_model.item_factors)

#LMF model
user_vecs_LMF = sparse.csr_matrix(lmf_model.user_factors)
item_vecs_LMF = sparse.csr_matrix(lmf_model.item_factors)

Evaluate the Recommender System

Check if the order of recommendations given for each user matches the items they ended up purchasing. The metric that will be used for this evaluation is the area under the Receiver Operating Characteristic (or ROC) curve. A greater area under the curve means that the system is recommending items that end up being purchased near the top of the list of recommended items.

In [54]:
# Helper function to calculate the mean area under the curve (AUC) 
def auc_score(predictions, test):
    fpr, tpr, thresholds = roc_curve(test, predictions)
    return auc(fpr, tpr)
In [55]:
# Function to calculate the AUC for each user in the training set that had at least one item masked
# Also calculate AUC for the most popular items for the users to compare

def calc_mean_auc(training_set, altered_persons, predictions, test_set):
    # An empty list to store the AUC for each user that had an item removed from the training set
    store_auc = [] 

    # To store popular AUC scores
    popularity_auc = [] 

    # Get sum of item iteractions to find most popular
    pop_contents = np.array(test_set.sum(axis = 1)).reshape(-1)
    content_vecs = predictions[1]

    # Iterate through each user that had an item altered
    for person in altered_persons: 

            # Get the training set column
            training_column = training_set[:,person].toarray().reshape(-1) 

            # Find where the interaction had not yet occurred
            zero_inds = np.where(training_column == 0)


            # Get the predicted values based on our user/item vectors
            person_vec = predictions[0][person,:]
            pred = person_vec.dot(content_vecs).toarray()[0,zero_inds].reshape(-1)

            # Get only the items that were originally zero
            # Select all ratings from the MF prediction for this user that originally had no iteraction
            actual = test_set[:,person].toarray()[zero_inds,0].reshape(-1)

            # Select the binarized yes/no interaction pairs from the original full data
            # that align with the same pairs in training
            # Get the item popularity for our chosen items
            pop = pop_contents[zero_inds]

            # Calculate AUC for the given user and store
            store_auc.append(auc_score(pred, actual)) 

            # Calculate AUC using most popular and score
            popularity_auc.append(auc_score(pop, actual))
            
            # left-hand side value is the auc score from the chosen model
            # right-hand side value is the auc score from the popular model
            return float('%.3f'%np.mean(store_auc)), float('%.3f'%np.mean(popularity_auc))
In [56]:
# Apply function - ALS model
calc_mean_auc(items_train, item_user_altered,[user_vecs_ALS, item_vecs_ALS.T], items_test)
Out[56]:
(0.814, 0.742)
In [57]:
# Apply function - BPR model
calc_mean_auc(items_train, item_user_altered,[user_vecs_BPR, item_vecs_BPR.T], items_test)
Out[57]:
(0.52, 0.742)
In [58]:
# Apply function - LMF model
calc_mean_auc(items_train, item_user_altered,[user_vecs_LMF, item_vecs_LMF.T], items_test)
Out[58]:
(0.682, 0.742)
In [59]:
# Plot initial results
fig = plt.figure()
ax = fig.add_axes([0,0,1,1])

als_score, pop_score  = calc_mean_auc(items_train, item_user_altered,[user_vecs_ALS, item_vecs_ALS.T], items_test)
pbr_score, pop_score  = calc_mean_auc(items_train, item_user_altered,[user_vecs_BPR, item_vecs_BPR.T], items_test)
lmf_score, pop_score = calc_mean_auc(items_train, item_user_altered,[user_vecs_LMF, item_vecs_LMF.T], items_test )


models_names = ['ALS', 'BPR', 'LMF']
models_scores = [als_score, pbr_score, lmf_score]
ax.bar(models_names,models_scores)
plt.title("Recommendation models initial AUC scores")
plt.show()

Hyper-parameters tuning

In [60]:
# hyper-parameters tuning function
def tune_hp(recomendation_model, sparse_item_user_matrix):

    # define 
    best_score = 0
    best_params = {'alpha': 0, 'latent_factors': 0, 'regularizations': 0, 'iterations': 0}
    
    # define parameters grid
    alphas = [10, 15, 20, 25, 30,35, 40]
    latent_factors = [10, 20, 30, 40, 50, 60]
    regularizations = [0.01, 0.05, 0.1, 0.5, 1.1, 1.5]
    iterations = [10, 20, 30, 40, 50, 60]
    
    # get training and validation sets
    items_train, items_test, item_user_altered = make_train(sparse_item_user_matrix, pct_test = 0.2)
    
    # interate over parameters
    for alpha in alphas:
        for factor in latent_factors:
            for regularization in regularizations:
                for iteration in iterations:

                    # create data to train the model
                    data = (items_train * alpha).astype('double')

                    # initialize model
                    model = recomendation_model(factors = factor, regularization = regularization, iterations = iteration)

                    # fit model
                    model.fit(data, show_progress = False)

                    # create vectors
                    user_vecs = sparse.csr_matrix(model.user_factors)
                    item_vecs = sparse.csr_matrix(model.item_factors)

                    # get score
                    score = calc_mean_auc(items_train, item_user_altered,[user_vecs, item_vecs.T], items_test)[0]

                    # update parameters if the score is higher
                    if score > best_score:
                        best_params['alpha'] = alpha
                        best_params['latent_factors'] = factor
                        best_params['regularizations'] = regularization
                        best_params['iterations'] = iteration
                        best_score = score
    
    # print results
    print("Best score:", score)
    message = "Best parameters: alpha: {}, factors: {}, regulation: {}, interactions: {}"
    print(message.format(best_params['alpha'], 
                         best_params['latent_factors'],
                         best_params['regularizations'],
                         best_params['iterations']))
    # return results   
    return score, best_params
# THIS CELL TAKES APROX. 1 HOUR TO RUN. THE OUTPUT IS IN THE NEXT CELL #Call function and find better parameters for ALS score, best_params = tune_hp(AlternatingLeastSquares, sparse_item_user)

As the cell above takes a long time to run, it will be kept as a markdown and the results displayed here.

For the Alternating Least Squares model, the optimum results found are:

  • Best score: 0.817
  • Best parameters: alpha: 15, factors: 50, regulation: 0.1, interactions: 40
# THIS CELL TAKES APROX. 1 HOUR TO RUN. THE OUTPUT IS IN THE NEXT CELL # Call function and find better parameters for LMF score, best_params = tune_hp(LogisticMatrixFactorization, sparse_item_user)

As the cell above takes a long time to run, it will be kept as a markdown and the results displayed here.

For the Logistic Matrix Factorization, the optimum results found are:

  • Best score: 0.731
  • Best parameters: alpha: 40, factors: 60, regulation: 0.05, interactions: 50
# THIS CELL TAKES APROX. 1 HOUR TO RUN. THE OUTPUT IS IN THE NEXT CELL # Call function and find better parameters for BPR score, best_params = tune_hp(BayesianPersonalizedRanking, sparse_item_user)

As the cell above takes a long time to run, it will be kept as a markdown and the results displayed here.

For the Bayesian Personalized Ranking, the optimum results found are:

  • Best score: 0.603
  • Best parameters: alpha: 20, factors: 40, regulation: 0.01, interactions: 60
In [73]:
# Plot final results
fig = plt.figure()
ax = fig.add_axes([0,0,1,1])


models_names = ['ALS', 'BPR', 'LMF']
models_scores = [81.7, 60.3, 73.1]
ax.bar(models_names,models_scores)
plt.title("Recommendation models initial AUC scores")
plt.show()
In [74]:
# Train model with the optmium parameters

# get training and validation sets
items_train, items_test, item_user_altered = make_train(sparse_item_user, pct_test = 0.2)

# Initialize the Alternating Least Squares (ALS) model with initial parameters
model = AlternatingLeastSquares(factors=50, regularization=0.1, iterations=40)

# Fit the model using the training sparse content-person matrix
alpha = 15
data = (items_train * alpha).astype('double')
model.fit(data)

Find similar items

In [75]:
# Pick a product and find its most similar products and compare their descriptions 

# Encoded Stock code of a product (randomly selected) 
item_id = 2706

# Number of similiar products to find (the number - 1)
n_similar = 11

# Get the customer and item vectors from trained model
user_vecs = model.user_factors
item_vecs = model.item_factors

# Calculate the vector norms
item_norms = np.sqrt((item_vecs * item_vecs).sum(axis=1))

# Calculate the similarity score
scores = item_vecs.dot(item_vecs[item_id]) /item_norms

# Get the top 10 most similar items
top_idx = np.argpartition(scores, -n_similar)[-n_similar:]

# Create a list of content-score tuples of most similar items
similar = sorted(zip(top_idx, scores[top_idx] / item_norms[item_id]), key=lambda x: -x[1])
In [76]:
# Get product stockcode
item_stock_code = grouped_df[grouped_df['Stock_Code'] == item_id]['StockCode'].unique()[0]

# Get product description
item_description = df1[df1['StockCode'] == item_stock_code]['Description'].unique().tolist()

# Print product description
print("Most similar products to: ", item_description[0], ":\n")

for item in similar[1:]:
    idx, score = item
    
    # Get stock code of the similiar products
    stock_code = grouped_df['StockCode'].loc[grouped_df['Stock_Code'] == idx].iloc[0]
    
    # Get products descriptions 
    similar_products =df1[df1['StockCode']==stock_code]['Description'].unique().tolist()[0]
    
    print(similar_products)
Most similar products to:  SMALL MARSHMALLOWS PINK BOWL :

SMALL CHOCOLATES PINK BOWL
SMALL DOLLY MIX DESIGN ORANGE BOWL
BISCUITS SMALL BOWL LIGHT BLUE
RED PUDDING SPOON
PACK OF 20 SKULL PAPER NAPKINS
PACK OF 6 SKULL PAPER PLATES
PACK OF 6 SKULL PAPER CUPS
DOG BOWL CHASING BALL DESIGN
RED RETROSPOT BOWL
EDWARDIAN DROP EARRINGS JET BLACK

It is possible to see that the recommended similar products are very related to the original product.

Recommend items to customers

In [77]:
# Create function return the top recommendations based on the customer/item vectors
# for items never interacted with before for any given person

def recommend(user_id, sparse_user_item, user_vecs, item_vecs, num_items=10):
    
    # Get the interactions scores from the sparse user item matrix
    user_interactions = sparse_user_item[user_id, :].toarray()
    
    # Add 1 to everything, so that items with no interaction yet become equal to 1
    user_interactions = user_interactions.reshape(-1) + 1
    
    # Make items already interacted zero
    user_interactions[user_interactions > 1] = 0
    
    # Get dot product of user vector and all item vectors
    rec_vector = user_vecs[user_id,:].dot(item_vecs.T).toarray()
    
    # Scale the recommendation vector between 0 and 1
    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1,1))[:,0]
    
    # Items already interacted have their recommendation multiplied by zero
    recommend_vector = user_interactions * rec_vector_scaled
    
    # Sort the indices of the item into order of best recommendations
    content_idx = np.argsort(recommend_vector)[::-1][:num_items]
    
    # Start empty list to store item descriptions and scores
    descriptions = []
    scores = []
    
    # Append descriptions and scores to the list
    for idx in content_idx:
        # get stock code of the the product
        stock_code = grouped_df['StockCode'].loc[grouped_df['Stock_Code'] == idx].iloc[0]

        # get products descriptions
        item_description = df1[df1['StockCode']==stock_code]['Description'].unique().tolist()[0]

        descriptions.append(item_description)
        scores.append(recommend_vector[idx])
    
    # Create dataframe with the resulting data
    recommendations = pd.DataFrame({'Descriptions': descriptions, 'score': scores})
    
    return recommendations
In [78]:
# Get the trained user and item vectors. Convert them to csr matrices
user_vecs = sparse.csr_matrix(model.user_factors)
item_vecs = sparse.csr_matrix(model.item_factors)

# Create recommendations for customer with encoded id = 1838 (randomly chosen)
user_id = 1993

# Make recomendations
recommendations = recommend(user_id, sparse_user_item, user_vecs, item_vecs)

# Print recomendations
print(recommendations)
                        Descriptions     score
0      LARGE CERAMIC TOP STORAGE JAR  0.776905
1                  REGENCY CAKE FORK  0.752191
2       PARISIENNE JEWELLERY DRAWER   0.741099
3     SILVER PLATE CANDLE BOWL SMALL  0.739686
4           PARISIENNE CURIO CABINET  0.732049
5         PACK OF SIX LED TEA LIGHTS  0.731849
6     MEDIUM CERAMIC TOP STORAGE JAR  0.698534
7      RED ENCHANTED FOREST PLACEMAT  0.696665
8              PARISIENNE SEWING BOX  0.693496
9  HAIRCLIPS FORTIES FABRIC ASSORTED  0.692437
In [79]:
# Check recommendations by comparing them to the 10 items this user has most interacted  with
most_bought_items = grouped_df[grouped_df['Customer_ID'] == user_id].sort_values(by=['Quantity'], ascending=False).head(10)

# Get item descriptions based on its StockCode
description_list = []
for index, row in most_bought_items.iterrows():
    code = str(row['StockCode'])
    description_list.append(df1[df1['StockCode'] == code]['Description'].unique()[0])

# Add item descriptions to the dataframe and filter relevant columns
most_bought_items['Description'] = description_list
most_bought_items = most_bought_items[['Description', 'Quantity']]
                     
# Show the 10 items this user has most interacted  with
most_bought_items
Out[79]:
Description Quantity
132309 GLASS APOTHECARY BOTTLE TONIC 12.0
132310 GLASS TWIST BON BON JAR 12.0
132308 GLASS BEURRE DISH 8.0
132311 GLASS SONGBIRD STORAGE JAR 8.0
132312 SET OF 16 VINTAGE ROSE CUTLERY 1.0
132313 SET OF 16 VINTAGE PISTACHIO CUTLERY 1.0
132314 SET OF 16 VINTAGE RED CUTLERY 1.0
132315 SET OF 16 VINTAGE BLACK CUTLERY 1.0
132316 SET OF 16 VINTAGE SKY BLUE CUTLERY 1.0

As it is possible to notice, the most purchased products from this user and the recommended products have very related descriptions.

New customers

As new users do not yet have interactions for the recommendation system to know their personal preferences, and as the only data available in the dataset about users is for their countries, first-time customers will be recommended the products that are currently the most sold in their regions.

In [80]:
# function to recommend the top 10 most recently sold products for a given country whithin a given number of last months
def recomendations_new_users(country, months):
    
    # filter for the entered country
    new_users_rec_df = df1[df1['Country'] == country]
    
    # get initial month
    initial_month = pd.Series(df1['Month_Year'].unique()).sort_values(ascending=False).iloc[months]
    
    # filter for recent purchases
    new_users_rec_df = new_users_rec_df[new_users_rec_df['Month_Year'] > initial_month]
    
    # select most sold products
    recommended_products_codes = new_users_rec_df.groupby(['StockCode'])['Quantity'].sum().sort_values(ascending=False)
    recommended_products_codes = recommended_products_codes.head(10).index.tolist()

    # initialize empty list
    recommended_products_names = []
    
    # get the names of the recommended products
    for stock_code in recommended_products_codes:
        
        product_description = new_users_rec_df[new_users_rec_df['StockCode'] == stock_code]['Description'].unique()[0]
        recommended_products_names.append(product_description)
        
    return recommended_products_names
In [81]:
# Recommend products to a new customer from United Kingdom based on the most sold products from the last 2 months
recomendations_new_users('United Kingdom', 2)
Out[81]:
['POPCORN HOLDER',
 'RABBIT NIGHT LIGHT',
 "PAPER CHAIN KIT 50'S CHRISTMAS ",
 'JUMBO BAG RED RETROSPOT',
 'ASSORTED COLOUR BIRD ORNAMENT',
 'WOODEN STAR CHRISTMAS SCANDINAVIAN',
 'DISCO BALL CHRISTMAS DECORATION',
 'WORLD WAR 2 GLIDERS ASSTD DESIGNS',
 'WOODEN HEART CHRISTMAS SCANDINAVIAN',
 'CREAM HANGING HEART T-LIGHT HOLDER']

Most of the recommended products are Christmas items, since the purchases made in the last two months of the dataframe are close to Christmas.